Exercise 1. Loading the dataset (raw_data)
raw_data<-read.csv("../output/raw_counts.csv.gz")
Exercise 4.Make a histogram of counts for each of the samples Conclusion: the data is not normally distributed
library(tidyr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#change data into long format
raw_data_long_format <- raw_data %>%
pivot_longer(c(-geneID), names_to="sample", values_to="count")
#Make a histogram
ggplot(raw_data_long_format, aes(x = log2(count))) +
geom_histogram(bins = 10, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Histogram of log2(Counts)", x = "Count", y = "Frequency")
## Warning: Removed 3192188 rows containing non-finite outside the scale range
## (`stat_bin()`).
raw_data <- raw_data[rowSums(raw_data[,-1] > 10) >= 3,]#For our subsequent analyses we want to reduce the data set to only those genes with some expression. In this case we will retain genes with > 10 reads in >= 3 samples
Exercise 5.check for correlation and visualize it
library(tidyr)
library(dplyr)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(viridis)
## Loading required package: viridisLite
library(stringr)
raw_data_limited <- raw_data[1:1000, -which(colnames(raw_data) == "geneID")]
# Adjust margins: c(bottom, left, top, right)
par(mar=c(4, 4, 2, 2))
#pairs(raw_data_limited) Large gra
raw_data_cor_table <- cor(raw_data_limited, use = "complete.obs")
raw_data_cor_table[1:5, 1:5]
## bhr.02_i0 bhr.02_i1 bhr.02_i2 bhr.02_v0 bhr.02_v1
## bhr.02_i0 1.0000000 0.9716989 0.8871227 0.8928627 0.9239195
## bhr.02_i1 0.9716989 1.0000000 0.9547929 0.7934465 0.8353361
## bhr.02_i2 0.8871227 0.9547929 1.0000000 0.6127897 0.6708893
## bhr.02_v0 0.8928627 0.7934465 0.6127897 1.0000000 0.9934911
## bhr.02_v1 0.9239195 0.8353361 0.6708893 0.9934911 1.0000000
raw_data_cor_table %>% gplots::heatmap.2(dendrogram="row", trace = "none", col=viridis::viridis(25, begin=.25), margins=c(7,8))
Exercise 6.Data Normalization
sample.description_raw_data <-tibble(sample=colnames(raw_data)[-1])
data <- sample.description_raw_data%>% mutate(
Pop = str_extract(sample,"bhr|ihl|kc2|lv3|sq3|tm2|wl2|yo1"), Vern = str_extract(sample, "v0|v1|v2|i0|i1|i2"),
group = str_c(Vern, Pop, sep = "_"))
print(head(data))
## # A tibble: 6 × 4
## sample Pop Vern group
## <chr> <chr> <chr> <chr>
## 1 bhr.02_i0 bhr i0 i0_bhr
## 2 bhr.02_i1 bhr i1 i1_bhr
## 3 bhr.02_i2 bhr i2 i2_bhr
## 4 bhr.02_v0 bhr v0 v0_bhr
## 5 bhr.02_v1 bhr v1 v1_bhr
## 6 bhr.02_v2 bhr v2 v2_bhr
#convert Elevation and Vernalization into factors for testing No vern,short vern, and long vern
sample.description_data <- data%>%
mutate(Pop=factor(Pop),
Vern=factor(Vern,levels = c("v0","v1", "v2","i0","i1","i2"))) # setting the levels in this way makes "v0" the reference
sample.description_data
Exercise 6. Normalization factor and plot
#calculate the effective library size and normalization factors using the TMM method
library(edgeR)
## Loading required package: limma
counts.matrix_raw_data <- raw_data %>% select(-geneID) %>% as.matrix()
rownames(counts.matrix_raw_data) <- raw_data$geneID
dge.data_Normalization <- DGEList(counts=counts.matrix_raw_data,
group=sample.description_raw_data$group)
## Warning: Unknown or uninitialised column: `group`.
dim(dge.data_Normalization)
## [1] 29983 206
dge.data_Normalization <- calcNormFactors(dge.data_Normalization, method = "TMM")
dge.data_Normalization$sample # look at the normalization factors
mds <- plotMDS(dge.data_Normalization, top = 5000, plot = FALSE, gene.selection = "common")
mds <- cbind(sample.description_data, dim1=mds@.Data[[9]], dim2=mds@.Data[[10]])
mds %>% ggplot(aes(x=dim1, y=dim2, color=Pop, pch=Vern)) +
geom_point() +
ggtitle("MDS plot, 5000 most variable genes")
## make sure sample order is the same in the dge object and in the sample sheet
all(sample.description_data$sample==rownames(dge.data_Normalization$samples) )
## [1] TRUE
subset bhr
dge.data_Normalization.bhr <- dge.data_Normalization[,sample.description_data$Pop=="bhr"]
dge.data_Normalization.bhr$samples
Exercise 8
data_Normalization<-cpm(dge.data_Normalization)
data_Normalization_log <- log2(data_Normalization[,-1]+1)
design <- model.matrix(~Pop+Vern,data = sample.description_data)
rownames(design) <- sample.description_data$sample
head(design)
## (Intercept) Popihl Popkc2 Poplv3 Popsq3 Poptm2 Popwl2 Popyo1 Vernv1
## bhr.02_i0 1 0 0 0 0 0 0 0 0
## bhr.02_i1 1 0 0 0 0 0 0 0 0
## bhr.02_i2 1 0 0 0 0 0 0 0 0
## bhr.02_v0 1 0 0 0 0 0 0 0 0
## bhr.02_v1 1 0 0 0 0 0 0 0 1
## bhr.02_v2 1 0 0 0 0 0 0 0 0
## Vernv2 Verni0 Verni1 Verni2
## bhr.02_i0 0 1 0 0
## bhr.02_i1 0 0 1 0
## bhr.02_i2 0 0 0 1
## bhr.02_v0 0 0 0 0
## bhr.02_v1 0 0 0 0
## bhr.02_v2 1 0 0 0
dge.data_Normalization_dispersions <- estimateGLMCommonDisp(dge.data_Normalization,design,verbose = TRUE)
## Disp = 1.02231 , BCV = 1.0111
#dge.data_Normalization_dispersions <- estimateGLMTrendedDisp(dge.data_Normalization_dispersions,design)
dge.data_Normalization_dispersions <-estimateGLMTagwiseDisp(dge.data_Normalization_dispersions,design)
plotBCV(dge.data_Normalization_dispersions)
Transcriptional
library(tibble)
library(ggplot2)
library(dplyr)
plotDE_line <- function(genes, dge.data_Normalization_dispersions, sample.description_raw_data) {
tmp.data <- cpm(dge.data_Normalization_dispersions[genes,], log = TRUE, prior.count = 1)
tmp.data <- as.data.frame(t(tmp.data))
colnames(tmp.data) <- genes
tmp.data <- tmp.data %>%
rownames_to_column("sample") %>%
left_join(sample.description_raw_data, by = "sample") %>%
pivot_longer(cols = genes, names_to = "gene", values_to = "log2_cpm")
# creating a average for each range
avg.data <- tmp.data %>%
group_by(Vern, Pop, gene) %>%
summarise(
mean_log2_cpm = mean(log2_cpm),
se = sd(log2_cpm) / sqrt(n()), # Calculate standard error
.groups = 'drop'
)
# delete the "#" below if you want to see errorbar
pl <- ggplot(avg.data, aes(x = Vern, y = mean_log2_cpm, color = Pop, group = Pop, pch = Pop)) +
geom_point() + # Plots the averaged points
geom_line() + # Connects the points with a line
#geom_errorbar(aes(ymin = mean_log2_cpm - se, ymax = mean_log2_cpm + se), width = 0.2) +
facet_wrap(~ gene) +
ylab("Average log2(cpm)") +
xlab("Genotype") +
scale_color_viridis_d()
return(pl)
}
#FLC gene
plotDE_line("Sdiv_ptg000009l_0614-R",dge.data_Normalization_dispersions,sample.description_data)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(genes)
##
## # Now:
## data %>% select(all_of(genes))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
plotDE_line("Sdiv_ptg000010l_2315-R",dge.data_Normalization_dispersions,sample.description_data)
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
#plotDE_boxplot("Sdiv_ptg000009l_0614-R",dge.data_Normalization_dispersions,sample.description_data)
#plotDE_boxplot("Sdiv_ptg000010l_2315-R",dge.data_Normalization_dispersions,sample.description_data)
t -test for v2 from plotDE_line_Sdiv_ptg000009l_0614
H0: (meanV2) = (meanNotV2) HA: (meanV2) ≠(meanNotV2)
Where (meanV2) is the mean log2(cpm) for the group where Vern = V2, and (meanNotV2)is the mean log2(cpm) for all other groups combined.
P value is 0.006145, reject H0 suggesting there is a statistical significant on the V2 group on average, while using alpha = 0.05
#Average gene and t test for FLC Sdiv_ptg000009l_0614
tmp.data_Sdiv_ptg000009l_0614 <- cpm(dge.data_Normalization_dispersions["Sdiv_ptg000009l_0614-R",], log = TRUE, prior.count = 1)
tmp.data_Sdiv_ptg000009l_0614 <- as.data.frame(t(tmp.data_Sdiv_ptg000009l_0614))
colnames(tmp.data_Sdiv_ptg000009l_0614) <- "Sdiv_ptg000009l_0614-R"
tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
rownames_to_column("sample") %>%
left_join(sample.description_raw_data, by = "sample") %>%
pivot_longer(cols = "Sdiv_ptg000009l_0614-R", names_to = "gene", values_to = "log2_cpm")
tmp.data_Sdiv_ptg000009l_0614 <- left_join(tmp.data_Sdiv_ptg000009l_0614, data, by = "sample")
avg_log2_cpm_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
print(avg_log2_cpm_Sdiv_ptg000009l_0614)
## # A tibble: 48 × 3
## Pop Vern avg_log2_cpm
## <chr> <chr> <dbl>
## 1 bhr i0 5.51
## 2 bhr i1 5.49
## 3 bhr i2 4.69
## 4 bhr v0 5.28
## 5 bhr v1 4.40
## 6 bhr v2 2.76
## 7 ihl i0 5.37
## 8 ihl i1 4.43
## 9 ihl i2 2.40
## 10 ihl v0 5.22
## # ℹ 38 more rows
# T-Test
# Create two groups
group_V2_Sdiv_ptg000009l_0614 <- avg_log2_cpm_Sdiv_ptg000009l_0614$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000009l_0614$Vern == "v2"]
group_not_V2_Sdiv_ptg000009l_0614 <- avg_log2_cpm_Sdiv_ptg000009l_0614$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000009l_0614$Vern != "v2"]
t_test_result_Sdiv_ptg000009l_0614 <- t.test(group_V2_Sdiv_ptg000009l_0614, group_not_V2_Sdiv_ptg000009l_0614, alternative = "two.sided", var.equal = FALSE)
print(t_test_result_Sdiv_ptg000009l_0614)
##
## Welch Two Sample t-test
##
## data: group_V2_Sdiv_ptg000009l_0614 and group_not_V2_Sdiv_ptg000009l_0614
## t = -3.6098, df = 8.5636, p-value = 0.006145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.518563 -1.020362
## sample estimates:
## mean of x mean of y
## 1.463795 4.233258
t -test for v2 from plotDE_line_Sdiv_ptg000010l_2315
H0: (meanV2) = (meanNotV2) HA: (meanV2) ≠(meanNotV2)
Where (meanV2) is the mean log2(cpm) for the group where Vern = V2, and (meanNotV2)is the mean log2(cpm) for all other groups combined.
The P value is 0.07004, therefore, we fail to reject H0 suggesting V2 group is not statistical significant than other groups, while using alpha = 0.05
#Average gene and t test for FLC Sdiv_ptg000010l_2315-R
tmp.data_Sdiv_ptg000010l_2315 <- cpm(dge.data_Normalization_dispersions["Sdiv_ptg000010l_2315-R",], log = TRUE, prior.count = 1)
tmp.data_Sdiv_ptg000010l_2315 <- as.data.frame(t(tmp.data_Sdiv_ptg000010l_2315))
colnames(tmp.data_Sdiv_ptg000010l_2315) <- "Sdiv_ptg000010l_2315-R"
tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
rownames_to_column("sample") %>%
left_join(sample.description_raw_data, by = "sample") %>%
pivot_longer(cols = "Sdiv_ptg000010l_2315-R", names_to = "gene", values_to = "log2_cpm")
tmp.data_Sdiv_ptg000010l_2315 <- left_join(tmp.data_Sdiv_ptg000010l_2315, data, by = "sample")
avg_log2_cpm_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
print(avg_log2_cpm_Sdiv_ptg000010l_2315)
## # A tibble: 48 × 3
## Pop Vern avg_log2_cpm
## <chr> <chr> <dbl>
## 1 bhr i0 4.72
## 2 bhr i1 4.77
## 3 bhr i2 4.35
## 4 bhr v0 3.92
## 5 bhr v1 4.16
## 6 bhr v2 3.68
## 7 ihl i0 4.62
## 8 ihl i1 4.68
## 9 ihl i2 2.78
## 10 ihl v0 3.80
## # ℹ 38 more rows
# T-Test
# Create two groups
group_V2_Sdiv_ptg000010l_2315 <- avg_log2_cpm_Sdiv_ptg000010l_2315$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000010l_2315$Vern == "v2"]
group_not_V2_Sdiv_ptg000010l_2315 <- avg_log2_cpm_Sdiv_ptg000010l_2315$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000010l_2315$Vern != "v2"]
t_test_result_Sdiv_ptg000010l_2315 <- t.test(group_V2_Sdiv_ptg000010l_2315, group_not_V2_Sdiv_ptg000010l_2315, alternative = "two.sided", var.equal = FALSE)
print(t_test_result_Sdiv_ptg000010l_2315)
##
## Welch Two Sample t-test
##
## data: group_V2_Sdiv_ptg000010l_2315 and group_not_V2_Sdiv_ptg000010l_2315
## t = -2.0446, df = 9.3574, p-value = 0.07004
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.3537675 0.1120589
## sample estimates:
## mean of x mean of y
## 2.737856 3.858710
V2 FLC Different Elevation Testing Sdiv_ptg000009l_0614
library(dplyr)
library(ggplot2)
tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
mutate(elevation = ifelse(Pop %in% c("tm", "ihl", "bhr", "kc"), "low",
ifelse(Pop %in% c("lv", "wl", "yo", "sq"), "high", NA)))
V2_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
filter(Vern == "v2")
ggplot(V2_tmp.data_Sdiv_ptg000009l_0614, aes(x=elevation, y=log2_cpm)) +
geom_boxplot() +
labs(title="Boxplot of FLC level by Elevation and Vern",
x="Elevation",
y="Log2 CPM",
fill="Vern Group") +
theme_minimal()
V2 FLC Different Elevation Testing Sdiv_ptg000010l_2315
library(dplyr)
library(ggplot2)
tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
mutate(elevation = ifelse(Pop %in% c("tm", "ihl", "bhr", "kc"), "low",
ifelse(Pop %in% c("lv", "wl", "yo", "sq"), "high", NA)))
V2_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
filter(Vern == "v2")
ggplot(V2_tmp.data_Sdiv_ptg000010l_2315, aes(x=elevation, y=log2_cpm)) +
geom_boxplot() +
labs(title="Boxplot of FLC level by Elevation and Vern",
x="Elevation",
y="Log2 CPM",
fill="Vern Group") +
theme_minimal()
Function for Separate graph Sdiv_ptg000010l_2315
library(patchwork)
#V0I0 group
V0I0_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
filter(Vern == "v0" | Vern == "i0")
V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315 <- V0I0_tmp.data_Sdiv_ptg000010l_2315 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
# Manually set the factor levels to control order
V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern <- factor(V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern,
levels = c("v0", "i0"))
pl2315_1 <- ggplot(V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() + # Plots the averaged points
geom_line() + # Connects the points within each group with lines
labs(title = "Sdiv_ptg000010l_2315",
x = "Treatment",
y = "Mean log2(cpm) Expression"
) +
scale_color_viridis_d() # Apply the viridis discrete color scale for aesthetic mapping
#V1I1 group
V1I1_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
filter(Vern == "v0" | Vern == "i1"| Vern == "v1")
V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315 <- V1I1_tmp.data_Sdiv_ptg000010l_2315 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
# Manually set the factor levels to control order
V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern <- factor(V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern,
levels = c("v0", "v1", "i1"))
pl2315_2 <- ggplot(V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() + # Plots the averaged points with different shapes per group
geom_line() + # Connects the points within each group with lines
labs(title = "Sdiv_ptg000010l_2315",
x = "Treatment",
y = "Mean_log2_cpm Expression",
) +
scale_color_viridis_d() # Apply the viridis discrete color scale for aesthetic mapping
#V2I2 group
V2I2_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
filter(Vern == "v0" | Vern == "i2"| Vern == "v2")
V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315 <- V2I2_tmp.data_Sdiv_ptg000010l_2315 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
# Manually set the factor levels to control order
V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern <- factor(V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern,
levels = c("v0", "v2", "i2"))
pl2315_3 <- ggplot(V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() + # Plots the averaged points with different shapes per group
geom_line() + # Connects the points within each group with lines
labs(title = "Sdiv_ptg000010l_2315",
x = "Treatment",
y = "Mean_log2_cpm Expression",
) +
scale_color_viridis_d() # Apply the viridis discrete color scale for aesthetic mapping
pl2315_1 / pl2315_2 / pl2315_3 + plot_layout(guides = "collect", axis_titles="collect")
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
Function for Separate graph Sdiv_ptg000009l_0614
#V0I0 group
V0I0_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
filter(Vern == "v0" | Vern == "i0")
V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614 <- V0I0_tmp.data_Sdiv_ptg000009l_0614 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
# Manually set the factor levels to control order
V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern <- factor(V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern,
levels = c("v0", "i0"))
pl614_1 <- ggplot(V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() + # Plots the averaged points
geom_line() + # Connects the points within each group with lines
labs(
x = "Treatment",
y = "Mean log2(cpm) Expression"
) +
scale_color_viridis_d() # Apply the viridis discrete color scale for aesthetic mapping
#V1I1 group
V1I1_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
filter(Vern == "v0" | Vern == "i1"| Vern == "v1")
V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614 <- V1I1_tmp.data_Sdiv_ptg000009l_0614 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
# Manually set the factor levels to control order
V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern <- factor(V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern,
levels = c("v0", "v1", "i1"))
pl614_2 <- ggplot(V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() + # Plots the averaged points with different shapes per group
geom_line() + # Connects the points within each group with lines
labs(
x = "Treatment",
y = "Mean_log2_cpm Expression",
) +
scale_color_viridis_d() # Apply the viridis discrete color scale for aesthetic mapping
#V2I2 group
V2I2_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
filter(Vern == "v0" | Vern == "i2"| Vern == "v2")
V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614 <- V2I2_tmp.data_Sdiv_ptg000009l_0614 %>%
group_by(Pop, Vern) %>%
summarise(
avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), # Calculate average, ignoring NA values
.groups = 'drop' # Drop the grouping structure after summarization
)
# Manually set the factor levels to control order
V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern <- factor(V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern,
levels = c("v0", "v2", "i2"))
pl614_3 <- ggplot(V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() + # Plots the averaged points with different shapes per group
geom_line() + # Connects the points within each group with lines
labs(
x = "Treatment",
y = "Mean_log2_cpm Expression",
) +
scale_color_viridis_d() # Apply the viridis discrete color scale for aesthetic mapping
pl614_1 / pl614_2 / pl614_3 + plot_layout(guides = "collect", axis_titles="collect")
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
Functions for separate graphs
library(dplyr)
library(ggplot2)
library(patchwork)
plot_gene_average_expression <- function(data, gene_id, vern_levels, pop_levels) {
# Filtering and summarizing data
summarized_data <- data %>%
filter(Vern %in% vern_levels) %>%
group_by(Pop, Vern) %>%
summarise(avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), .groups = 'drop') %>%
mutate(Vern = factor(Vern, levels = vern_levels))
# Create the plot
plot <- ggplot(summarized_data, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
geom_point() +
geom_line() +
labs(
title = paste("Average expression", gene_id),
x = "Treatment",
y = "Mean log2(cpm) Expression"
) +
scale_color_viridis_d()
return(plot)
}
graphs using functions
# Example "Sdiv_ptg000009l_0614"
plot1_Sdiv_ptg000009l_0614 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000009l_0614, "Sdiv_ptg000009l_0614", c("v0", "i0"), unique(tmp.data_Sdiv_ptg000009l_0614$Pop))
plot2_Sdiv_ptg000009l_0614 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000009l_0614, "Sdiv_ptg000009l_0614", c("v0", "v1", "i1"), unique(tmp.data_Sdiv_ptg000009l_0614$Pop))
plot3_Sdiv_ptg000009l_0614 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000009l_0614, "Sdiv_ptg000009l_0614", c("v0", "v2", "i2"), unique(tmp.data_Sdiv_ptg000009l_0614$Pop))
final_plot_Sdiv_ptg000009l_0614 <- plot1_Sdiv_ptg000009l_0614 / plot2_Sdiv_ptg000009l_0614 / plot3_Sdiv_ptg000009l_0614 + plot_layout(guides = "collect") + plot_annotation(title = " Gene Sdiv_ptg000009l_0614")
print(final_plot_Sdiv_ptg000009l_0614)
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Example Sdiv_ptg000010l_2315
plot1_Sdiv_ptg000010l_2315 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000010l_2315, "Sdiv_ptg000010l_2315", c("v0", "i0"), unique(tmp.data_Sdiv_ptg000010l_2315$Pop))
plot2_Sdiv_ptg000010l_2315 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000010l_2315, "Sdiv_ptg000010l_2315", c("v0", "v1", "i1"), unique(tmp.data_Sdiv_ptg000010l_2315$Pop))
plot3_Sdiv_ptg000010l_2315 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000010l_2315, "Sdiv_ptg000010l_2315", c("v0", "v2", "i2"), unique(tmp.data_Sdiv_ptg000010l_2315$Pop))
final_plot_Sdiv_ptg000010l_2315 <- plot1_Sdiv_ptg000010l_2315 / plot2_Sdiv_ptg000010l_2315 / plot3_Sdiv_ptg000010l_2315 + plot_layout(guides = "collect") + plot_annotation(title = " Gene Sdiv_ptg000010l_2315")
print(final_plot_Sdiv_ptg000010l_2315)
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
subset bhr single comparsion v0 vs v1
dge.data_Normalization.bhr <- dge.data_Normalization [,sample.description_data $Pop=="bhr"]
dge.data_Normalization.bhr$samples
sample.description_data.bhr <- sample.description_data %>%
filter(Pop == "bhr")
design.bhr <- model.matrix(~Vern,data = sample.description_data.bhr)
rownames(design.bhr) <- sample.description_data.bhr$samples
## Warning: Unknown or uninitialised column: `samples`.
design.bhr
## (Intercept) Vernv1 Vernv2 Verni0 Verni1 Verni2
## [1,] 1 0 0 1 0 0
## [2,] 1 0 0 0 1 0
## [3,] 1 0 0 0 0 1
## [4,] 1 0 0 0 0 0
## [5,] 1 1 0 0 0 0
## [6,] 1 0 1 0 0 0
## [7,] 1 0 0 1 0 0
## [8,] 1 0 0 0 1 0
## [9,] 1 0 0 0 0 1
## [10,] 1 0 0 0 0 0
## [11,] 1 1 0 0 0 0
## [12,] 1 0 1 0 0 0
## [13,] 1 0 0 1 0 0
## [14,] 1 0 0 0 1 0
## [15,] 1 0 0 0 0 1
## [16,] 1 0 0 0 0 0
## [17,] 1 1 0 0 0 0
## [18,] 1 0 1 0 0 0
## [19,] 1 0 0 1 0 0
## [20,] 1 0 0 0 1 0
## [21,] 1 0 0 0 0 1
## [22,] 1 0 0 0 0 0
## [23,] 1 1 0 0 0 0
## [24,] 1 0 1 0 0 0
## [25,] 1 0 0 1 0 0
## [26,] 1 0 0 0 1 0
## [27,] 1 0 0 0 0 1
## [28,] 1 0 0 0 0 0
## [29,] 1 1 0 0 0 0
## [30,] 1 0 1 0 0 0
## attr(,"assign")
## [1] 0 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Vern
## [1] "contr.treatment"
dge.data_Normalization_dispersions.bhr <- estimateGLMCommonDisp(dge.data_Normalization.bhr,design.bhr,verbose = TRUE)
## Disp = 0.29412 , BCV = 0.5423
fit.bhr <- glmFit(dge.data_Normalization_dispersions.bhr,design.bhr)
gt.lrt.bhr <- glmLRT(fit.bhr,coef="Vernv1")
result_v0v1_bhr<-summary(decideTests(gt.lrt.bhr,p=0.01))
print(result_v0v1_bhr)
## Vernv1
## Down 298
## NotSig 29088
## Up 597
subset yo single comparsion
dge.data_Normalization.yo1 <- dge.data_Normalization [,sample.description_data $Pop=="yo1"]
dge.data_Normalization.yo1$samples
sample.description_data.yo1 <- sample.description_data %>%
filter(Pop == "yo1")
sample.description_data.yo1
design.yo1 <- model.matrix(~Vern,data = sample.description_data.yo1)
rownames(design.yo1) <- sample.description_data.yo1$samples
## Warning: Unknown or uninitialised column: `samples`.
design.yo1
## (Intercept) Vernv1 Vernv2 Verni0 Verni1 Verni2
## [1,] 1 0 0 1 0 0
## [2,] 1 0 0 0 1 0
## [3,] 1 0 0 0 0 1
## [4,] 1 0 0 0 0 0
## [5,] 1 1 0 0 0 0
## [6,] 1 0 1 0 0 0
## [7,] 1 0 0 1 0 0
## [8,] 1 0 0 0 1 0
## [9,] 1 0 0 0 0 0
## [10,] 1 1 0 0 0 0
## [11,] 1 0 1 0 0 0
## [12,] 1 0 0 1 0 0
## [13,] 1 0 0 0 0 1
## [14,] 1 0 0 0 0 0
## [15,] 1 1 0 0 0 0
## [16,] 1 0 1 0 0 0
## [17,] 1 0 0 1 0 0
## [18,] 1 0 0 0 0 1
## [19,] 1 0 0 0 0 0
## [20,] 1 1 0 0 0 0
## [21,] 1 0 1 0 0 0
## [22,] 1 0 0 1 0 0
## [23,] 1 0 0 0 1 0
## [24,] 1 0 0 0 0 1
## [25,] 1 0 0 0 0 0
## [26,] 1 1 0 0 0 0
## [27,] 1 0 1 0 0 0
## attr(,"assign")
## [1] 0 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Vern
## [1] "contr.treatment"
dge.data_Normalization_dispersions.yo1 <- estimateGLMCommonDisp(dge.data_Normalization.yo1,design.yo1,verbose = TRUE)
## Disp = 1.2375 , BCV = 1.1124
fit.yo1 <- glmFit(dge.data_Normalization_dispersions.yo1,design.yo1)
gt.lrt.yo1 <- glmLRT(fit.yo1,coef="Vernv1")
result_v0v1.yo1<-summary(decideTests(gt.lrt.yo1,p=0.01))
print(result_v0v1.yo1)
## Vernv1
## Down 270
## NotSig 29373
## Up 340
Loop for each population (vo vs v1)
| Population | Down | NotSig | Up |
|---|---|---|---|
| bhr | 298 | 29088 | 597 |
| ihl | 203 | 28977 | 803 |
| kc | 282 | 28978 | 723 |
| lv | 203 | 28458 | 1322 |
| sq | 637 | 29302 | 44 |
| tm | 1521 | 27924 | 538 |
| wl | 112 | 29533 | 338 |
| yo | 270 | 29373 | 340 |
pop_groups <- unique(sample.description_data$Pop)
results_list <- list()
for (pop in pop_groups) {
#Extract the Normalization data for each population
dge.data_Normalization_pop <- dge.data_Normalization[, sample.description_data$Pop == pop]
#Extract sample description data for each population
sample.description_data_pop <- sample.description_data %>% filter(Pop == pop)
#redesign the matrix with correct # column for each population
design_pop <- model.matrix(~Vern, data = sample.description_data_pop)
rownames(design_pop) <- sample.description_data_pop$samples
#using matrix to find out the Normalization dispersions for each population
dge.data_Normalization_dispersions_pop <- estimateGLMCommonDisp(dge.data_Normalization_pop, design_pop, verbose = TRUE)
#Fit the model for each population dispersion data
fit_pop <- glmFit(dge.data_Normalization_dispersions_pop, design_pop)
# Conduct likelihood ratio test for each population
gt.lrt_pop <- glmLRT(fit_pop, coef = "Vernv1")
# Summarize the results for each population (up / down regulation)
result_v0v1_pop <- summary(decideTests(gt.lrt_pop, p = 0.01))
description <- "v0v1 comparison for population: "
results_list[[pop]] <- list(
Description = paste(description, pop, sep=""),
Results = result_v0v1_pop)
}
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.29412 , BCV = 0.5423
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.15282 , BCV = 0.3909
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.41909 , BCV = 0.6474
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.78411 , BCV = 0.8855
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.80391 , BCV = 1.3431
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.24269 , BCV = 1.1148
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.04389 , BCV = 1.0217
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.2375 , BCV = 1.1124
# Print or return results
print(results_list)
## $bhr
## $bhr$Description
## [1] "v0v1 comparison for population: bhr"
##
## $bhr$Results
## Vernv1
## Down 298
## NotSig 29088
## Up 597
##
##
## $ihl
## $ihl$Description
## [1] "v0v1 comparison for population: ihl"
##
## $ihl$Results
## Vernv1
## Down 203
## NotSig 28977
## Up 803
##
##
## $kc2
## $kc2$Description
## [1] "v0v1 comparison for population: kc2"
##
## $kc2$Results
## Vernv1
## Down 282
## NotSig 28978
## Up 723
##
##
## $lv3
## $lv3$Description
## [1] "v0v1 comparison for population: lv3"
##
## $lv3$Results
## Vernv1
## Down 203
## NotSig 28458
## Up 1322
##
##
## $sq3
## $sq3$Description
## [1] "v0v1 comparison for population: sq3"
##
## $sq3$Results
## Vernv1
## Down 637
## NotSig 29302
## Up 44
##
##
## $tm2
## $tm2$Description
## [1] "v0v1 comparison for population: tm2"
##
## $tm2$Results
## Vernv1
## Down 1530
## NotSig 27905
## Up 548
##
##
## $wl2
## $wl2$Description
## [1] "v0v1 comparison for population: wl2"
##
## $wl2$Results
## Vernv1
## Down 112
## NotSig 29533
## Up 338
##
##
## $yo1
## $yo1$Description
## [1] "v0v1 comparison for population: yo1"
##
## $yo1$Results
## Vernv1
## Down 270
## NotSig 29373
## Up 340
Loop for each population (vo vs v2)
| Population | Down | NotSig | Up |
|---|---|---|---|
| bhr | 530 | 28489 | 964 |
| ihl | 368 | 28630 | 985 |
| kc | 105 | 29197 | 681 |
| lv | 1081 | 27785 | 1117 |
| sq | 707 | 29161 | 115 |
| tm | 35 | 29708 | 240 |
| wl | 223 | 29277 | 483 |
| yo | 126 | 29297 | 560 |
pop_groups <- unique(sample.description_data$Pop)
results_list <- list()
for (pop in pop_groups) {
#Extract the Normalization data for each population
dge.data_Normalization_pop <- dge.data_Normalization[, sample.description_data$Pop == pop]
#Extract sample description data for each population
sample.description_data_pop <- sample.description_data %>% filter(Pop == pop)
#redesign the matrix with correct # column for each population
design_pop <- model.matrix(~Vern, data = sample.description_data_pop)
rownames(design_pop) <- sample.description_data_pop$samples
#using matrix to find out the Normalization dispersions for each population
dge.data_Normalization_dispersions_pop <- estimateGLMCommonDisp(dge.data_Normalization_pop, design_pop, verbose = TRUE)
#Fit the model for each population dispersion data
fit_pop <- glmFit(dge.data_Normalization_dispersions_pop, design_pop)
# Conduct likelihood ratio test for each population
gt.lrt_pop <- glmLRT(fit_pop, coef = "Vernv2")
# Summarize the results for each population (up / down regulation)
result_v0v2_pop <- summary(decideTests(gt.lrt_pop, p = 0.01))
description <- "v0v2 comparison for population: "
results_list[[pop]] <- list(
Description = paste(description, pop, sep=""),
Results = result_v0v2_pop)
}
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.29412 , BCV = 0.5423
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.15282 , BCV = 0.3909
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.41909 , BCV = 0.6474
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.78411 , BCV = 0.8855
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.80391 , BCV = 1.3431
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.24269 , BCV = 1.1148
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.04389 , BCV = 1.0217
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.2375 , BCV = 1.1124
# Print or return results
print(results_list)
## $bhr
## $bhr$Description
## [1] "v0v2 comparison for population: bhr"
##
## $bhr$Results
## Vernv2
## Down 530
## NotSig 28489
## Up 964
##
##
## $ihl
## $ihl$Description
## [1] "v0v2 comparison for population: ihl"
##
## $ihl$Results
## Vernv2
## Down 368
## NotSig 28630
## Up 985
##
##
## $kc2
## $kc2$Description
## [1] "v0v2 comparison for population: kc2"
##
## $kc2$Results
## Vernv2
## Down 105
## NotSig 29197
## Up 681
##
##
## $lv3
## $lv3$Description
## [1] "v0v2 comparison for population: lv3"
##
## $lv3$Results
## Vernv2
## Down 1081
## NotSig 27785
## Up 1117
##
##
## $sq3
## $sq3$Description
## [1] "v0v2 comparison for population: sq3"
##
## $sq3$Results
## Vernv2
## Down 707
## NotSig 29161
## Up 115
##
##
## $tm2
## $tm2$Description
## [1] "v0v2 comparison for population: tm2"
##
## $tm2$Results
## Vernv2
## Down 37
## NotSig 29706
## Up 240
##
##
## $wl2
## $wl2$Description
## [1] "v0v2 comparison for population: wl2"
##
## $wl2$Results
## Vernv2
## Down 223
## NotSig 29277
## Up 483
##
##
## $yo1
## $yo1$Description
## [1] "v0v2 comparison for population: yo1"
##
## $yo1$Results
## Vernv2
## Down 126
## NotSig 29297
## Up 560
Loop for each population (v1 vs v2)
| Population | Down | NotSig | Up |
|---|---|---|---|
| bhr | 468 | 29029 | 486 |
| ihl | 374 | 29304 | 305 |
| kc | 19 | 29778 | 186 |
| lv | 1220 | 28555 | 208 |
| sq | 195 | 29605 | 183 |
| tm | 0 | 29940 | 43 |
| wl | 108 | 29689 | 186 |
| yo | 87 | 29465 | 431 |
sample.description_data_v1_reference <- data%>%
mutate(Pop=factor(Pop),
Vern=factor(Vern,levels = c("v1","v0", "v2","i0","i1","i2"))) # setting the levels in this way makes "v1" the reference
sample.description_data_v1_reference
pop_groups <- unique(sample.description_data_v1_reference$Pop)
results_list <- list()
for (pop in pop_groups) {
#Extract the Normalization data for each population
dge.data_Normalization_pop <- dge.data_Normalization[, sample.description_data_v1_reference$Pop == pop]
#Extract sample description data for each population
sample.description_data_pop <- sample.description_data_v1_reference %>% filter(Pop == pop)
#redesign the matrix with correct # column for each population
design_pop <- model.matrix(~Vern, data = sample.description_data_pop)
rownames(design_pop) <- sample.description_data_pop$samples
#using matrix to find out the Normalization dispersions for each population
dge.data_Normalization_dispersions_pop <- estimateGLMCommonDisp(dge.data_Normalization_pop, design_pop, verbose = TRUE)
#Fit the model for each population dispersion data
fit_pop <- glmFit(dge.data_Normalization_dispersions_pop, design_pop)
# Conduct likelihood ratio test for each population
gt.lrt_pop <- glmLRT(fit_pop, coef = "Vernv2")
# Summarize the results for each population (up / down regulation)
result_v1v2_pop <- summary(decideTests(gt.lrt_pop, p = 0.01))
description <- "v1v2 comparison for population: "
results_list[[pop]] <- list(
Description = paste(description, pop, sep=""),
Results = result_v1v2_pop)
}
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.29412 , BCV = 0.5423
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.15282 , BCV = 0.3909
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.41909 , BCV = 0.6474
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.78411 , BCV = 0.8855
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.80391 , BCV = 1.3431
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.24269 , BCV = 1.1148
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.04389 , BCV = 1.0217
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.2375 , BCV = 1.1124
# Print or return results
print(results_list)
## $bhr
## $bhr$Description
## [1] "v1v2 comparison for population: bhr"
##
## $bhr$Results
## Vernv2
## Down 468
## NotSig 29029
## Up 486
##
##
## $ihl
## $ihl$Description
## [1] "v1v2 comparison for population: ihl"
##
## $ihl$Results
## Vernv2
## Down 374
## NotSig 29304
## Up 305
##
##
## $kc2
## $kc2$Description
## [1] "v1v2 comparison for population: kc2"
##
## $kc2$Results
## Vernv2
## Down 19
## NotSig 29778
## Up 186
##
##
## $lv3
## $lv3$Description
## [1] "v1v2 comparison for population: lv3"
##
## $lv3$Results
## Vernv2
## Down 1220
## NotSig 28555
## Up 208
##
##
## $sq3
## $sq3$Description
## [1] "v1v2 comparison for population: sq3"
##
## $sq3$Results
## Vernv2
## Down 195
## NotSig 29605
## Up 183
##
##
## $tm2
## $tm2$Description
## [1] "v1v2 comparison for population: tm2"
##
## $tm2$Results
## Vernv2
## Down 0
## NotSig 29926
## Up 57
##
##
## $wl2
## $wl2$Description
## [1] "v1v2 comparison for population: wl2"
##
## $wl2$Results
## Vernv2
## Down 108
## NotSig 29689
## Up 186
##
##
## $yo1
## $yo1$Description
## [1] "v1v2 comparison for population: yo1"
##
## $yo1$Results
## Vernv2
## Down 87
## NotSig 29465
## Up 431
Full Table
# Load the necessary library
library(dplyr)
# Define the data frames for each comparison
vo_vs_v1_table <- data.frame(
Population = c("bhr", "ihl", "kc", "lv", "sq", "tm", "wl", "yo"),
Down = c(298, 203, 282, 203, 637, 1521, 112, 270),
NotSig = c(29088, 28977, 28978, 28458, 29302, 27924, 29533, 29373),
Up = c(597, 803, 723, 1322, 44, 538, 338, 340),
Comparison = 'vo_vs_v1',
Elevation = c("low","low","low","high","high","low","high","high")
)
vo_vs_v2_table<- data.frame(
Population = c("bhr", "ihl", "kc", "lv", "sq", "tm", "wl", "yo"),
Down = c(530, 368, 105, 1081, 707, 35, 223, 126),
NotSig = c(28489, 28630, 29197, 27785, 29161, 29708, 29277, 29297),
Up = c(964, 985, 681, 1117, 115, 240, 483, 560),
Comparison = 'vo_vs_v2',
Elevation = c("low","low","low","high","high","low","high","high")
)
v1_vs_v2_table <- data.frame(
Population = c("bhr", "ihl", "kc", "lv", "sq", "tm", "wl", "yo"),
Down = c(468, 374, 19, 1220, 195, 0, 108, 87),
NotSig = c(29029, 29304, 29778, 28555, 29605, 29940, 29689, 29465),
Up = c(486, 305, 186, 208, 183, 43, 186, 431),
Comparison = 'v1_vs_v2',
Elevation = c("low","low","low","high","high","low","high","high")
)
combined_full_data_table <- bind_rows(vo_vs_v1_table, vo_vs_v2_table, v1_vs_v2_table)
write.csv(combined_full_data_table, "combined_full_data_table_v0v1v2.csv", row.names = FALSE)
print(combined_full_data_table)
## Population Down NotSig Up Comparison Elevation
## 1 bhr 298 29088 597 vo_vs_v1 low
## 2 ihl 203 28977 803 vo_vs_v1 low
## 3 kc 282 28978 723 vo_vs_v1 low
## 4 lv 203 28458 1322 vo_vs_v1 high
## 5 sq 637 29302 44 vo_vs_v1 high
## 6 tm 1521 27924 538 vo_vs_v1 low
## 7 wl 112 29533 338 vo_vs_v1 high
## 8 yo 270 29373 340 vo_vs_v1 high
## 9 bhr 530 28489 964 vo_vs_v2 low
## 10 ihl 368 28630 985 vo_vs_v2 low
## 11 kc 105 29197 681 vo_vs_v2 low
## 12 lv 1081 27785 1117 vo_vs_v2 high
## 13 sq 707 29161 115 vo_vs_v2 high
## 14 tm 35 29708 240 vo_vs_v2 low
## 15 wl 223 29277 483 vo_vs_v2 high
## 16 yo 126 29297 560 vo_vs_v2 high
## 17 bhr 468 29029 486 v1_vs_v2 low
## 18 ihl 374 29304 305 v1_vs_v2 low
## 19 kc 19 29778 186 v1_vs_v2 low
## 20 lv 1220 28555 208 v1_vs_v2 high
## 21 sq 195 29605 183 v1_vs_v2 high
## 22 tm 0 29940 43 v1_vs_v2 low
## 23 wl 108 29689 186 v1_vs_v2 high
## 24 yo 87 29465 431 v1_vs_v2 high
Histogram of the Full
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Read the data
combined.full.histogram.data <- read.csv("combined_full_data_table_v0v1v2.csv")
# Melt the data to long format for easier plotting
combined.full.histogram.plot <- combined.full.histogram.data %>%
gather(key = "Regulation", value = "Count", -Population, -Comparison,-Elevation)
# Create a new variable for easier labeling in the plot
combined.full.histogram.plot$Population_Comparison <- paste(combined.full.histogram.plot$Population, combined.full.histogram.plot$Comparison, combined.full.histogram.plot$Elevation,sep = "/")
combined.full.histogram.plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", combined.full.histogram.plot$Population_Comparison)
combined.full.histogram.plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", combined.full.histogram.plot$Population_Comparison)
combined.full.histogram.plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", combined.full.histogram.plot$Population_Comparison)
ggplot(combined.full.histogram.plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) +
labs(title = "Gene Regulation Across Populations and Comparisons",
x = "Population (Comparison)",
y = "Count of Genes") +
theme_minimal() +
scale_fill_brewer(palette = "Dark2") + # Set a color palette
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better visibility
Group_1_tm_lv_data <- filter(combined.full.histogram.data, Population %in% c("tm", "lv"))
Group_2_ihl_wl_data <- filter(combined.full.histogram.data, Population %in% c("ihl", "wl"))
Group_3_bhr_yo_data <- filter(combined.full.histogram.data, Population %in% c("bhr", "yo"))
Group_4_kc_sq_data <- filter(combined.full.histogram.data, Population %in% c("kc", "sq"))
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr) # for gather function
# Assuming Group_1_tm_lv_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_1_plot <- Group_1_tm_lv_data %>%
gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)
# Create a new variable for easier labeling in the plot
group_1_plot$Population_Comparison <- paste(group_1_plot$Population, group_1_plot$Comparison, group_1_plot$Elevation, sep = "/")
# Simplify labels if necessary (depending on your specific labeling needs)
group_1_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_1_plot$Population_Comparison)
group_1_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_1_plot$Population_Comparison)
group_1_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_1_plot$Population_Comparison)
# Plotting
ggplot(group_1_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) +
labs(title = "Gene Regulation in Group 1 (Populations tm and lv)",
x = "Population (Comparison/Elevation)",
y = "Count of Genes") +
theme_minimal() +
scale_fill_brewer(palette = "Dark2") + # Set a color palette
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better visibility
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr) # for gather function
# Assuming Group_2_ihl_wl_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_2_plot <- Group_2_ihl_wl_data %>%
gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)
# Create a new variable for easier labeling in the plot
group_2_plot$Population_Comparison <- paste(group_2_plot$Population, group_2_plot$Comparison, group_2_plot$Elevation, sep = "/")
# Simplify labels if necessary (depending on your specific labeling needs)
group_2_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_2_plot$Population_Comparison)
group_2_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_2_plot$Population_Comparison)
group_2_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_2_plot$Population_Comparison)
# Plotting
ggplot(group_2_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) +
labs(title = "Gene Regulation in Group 2 (Populations ihl and wl)",
x = "Population (Comparison/Elevation)",
y = "Count of Genes") +
theme_minimal() +
scale_fill_brewer(palette = "Dark2") + # Set a color palette
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better visibility
# Assuming Group_3_bhr_yo_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_3_plot <- Group_3_bhr_yo_data %>%
gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)
# Create a new variable for easier labeling in the plot
group_3_plot$Population_Comparison <- paste(group_3_plot$Population, group_3_plot$Comparison, group_3_plot$Elevation, sep = "/")
# Simplify labels if necessary
group_3_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_3_plot$Population_Comparison)
group_3_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_3_plot$Population_Comparison)
group_3_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_3_plot$Population_Comparison)
# Plotting
ggplot(group_3_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) +
labs(title = "Gene Regulation in Group 3 (Populations bhr and yo)",
x = "Population (Comparison/Elevation)",
y = "Count of Genes") +
theme_minimal() +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Assuming Group_4_kc_sq_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_4_plot <- Group_4_kc_sq_data %>%
gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)
# Create a new variable for easier labeling in the plot
group_4_plot$Population_Comparison <- paste(group_4_plot$Population, group_4_plot$Comparison, group_4_plot$Elevation, sep = "/")
# Simplify labels if necessary
group_4_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_4_plot$Population_Comparison)
group_4_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_4_plot$Population_Comparison)
group_4_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_4_plot$Population_Comparison)
# Plotting
ggplot(group_4_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) +
labs(title = "Gene Regulation in Group 4 (Populations kc and sq)",
x = "Population (Comparison/Elevation)",
y = "Count of Genes") +
theme_minimal() +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))